0-Libraries and first observations
# usual data science stack in python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# imports of needed modules from sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.dummy import DummyClassifier
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
# load main datasets
app_train, app_test = pd.read_csv("./data/application_train.csv"), pd.read_csv("./data/application_test.csv")
display(app_train.head(3))
display(app_test.head(3))
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 122 columns
SK_ID_CURR | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | Cash loans | F | N | Y | 0 | 135000.0 | 568800.0 | 20560.5 | 450000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 100005 | Cash loans | M | N | Y | 0 | 99000.0 | 222768.0 | 17370.0 | 180000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
2 | 100013 | Cash loans | M | Y | Y | 0 | 202500.0 | 663264.0 | 69777.0 | 630000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 |
3 rows × 121 columns
app_train.shape, app_test.shape
((307511, 122), (48744, 121))
1-Data analysis
app_train.TARGET.value_counts()
0    282686
1     24825
Name: TARGET, dtype: int64
print("Pourcentage de clients en difficultés de payments:",app_train.TARGET.sum() / app_train.shape[0] * 100 ,"%")
Pourcentage de clients en difficultés de payments: 8.072881945686495 %
plt.title('Distribution of the "TARGET" column - 1 -> client with payment difficulties / 0 -> all other cases')
sns.countplot(x=app_train.TARGET, data=app_train)
plt.show()
app_train.info()
print("----")
app_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB ---- <class 'pandas.core.frame.DataFrame'> RangeIndex: 48744 entries, 0 to 48743 Columns: 121 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR dtypes: float64(65), int64(40), object(16) memory usage: 45.0+ MB
display(app_train.describe())
print("<---->")
display(app_test.describe())
SK_ID_CURR | TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 307511.000000 | 307511.000000 | 307511.000000 | 3.075110e+05 | 3.075110e+05 | 307499.000000 | 3.072330e+05 | 307511.000000 | 307511.000000 | 307511.000000 | ... | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 |
mean | 278180.518577 | 0.080729 | 0.417052 | 1.687979e+05 | 5.990260e+05 | 27108.573909 | 5.383962e+05 | 0.020868 | -16036.995067 | 63815.045904 | ... | 0.008130 | 0.000595 | 0.000507 | 0.000335 | 0.006402 | 0.007000 | 0.034362 | 0.267395 | 0.265474 | 1.899974 |
std | 102790.175348 | 0.272419 | 0.722121 | 2.371231e+05 | 4.024908e+05 | 14493.737315 | 3.694465e+05 | 0.013831 | 4363.988632 | 141275.766519 | ... | 0.089798 | 0.024387 | 0.022518 | 0.018299 | 0.083849 | 0.110757 | 0.204685 | 0.916002 | 0.794056 | 1.869295 |
min | 100002.000000 | 0.000000 | 0.000000 | 2.565000e+04 | 4.500000e+04 | 1615.500000 | 4.050000e+04 | 0.000290 | -25229.000000 | -17912.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 189145.500000 | 0.000000 | 0.000000 | 1.125000e+05 | 2.700000e+05 | 16524.000000 | 2.385000e+05 | 0.010006 | -19682.000000 | -2760.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 278202.000000 | 0.000000 | 0.000000 | 1.471500e+05 | 5.135310e+05 | 24903.000000 | 4.500000e+05 | 0.018850 | -15750.000000 | -1213.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 367142.500000 | 0.000000 | 1.000000 | 2.025000e+05 | 8.086500e+05 | 34596.000000 | 6.795000e+05 | 0.028663 | -12413.000000 | -289.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
max | 456255.000000 | 1.000000 | 19.000000 | 1.170000e+08 | 4.050000e+06 | 258025.500000 | 4.050000e+06 | 0.072508 | -7489.000000 | 365243.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 9.000000 | 8.000000 | 27.000000 | 261.000000 | 25.000000 |
8 rows × 106 columns
<---->
SK_ID_CURR | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 48744.000000 | 48744.000000 | 4.874400e+04 | 4.874400e+04 | 48720.000000 | 4.874400e+04 | 48744.000000 | 48744.000000 | 48744.000000 | 48744.000000 | ... | 48744.000000 | 48744.0 | 48744.0 | 48744.0 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 |
mean | 277796.676350 | 0.397054 | 1.784318e+05 | 5.167404e+05 | 29426.240209 | 4.626188e+05 | 0.021226 | -16068.084605 | 67485.366322 | -4967.652716 | ... | 0.001559 | 0.0 | 0.0 | 0.0 | 0.002108 | 0.001803 | 0.002787 | 0.009299 | 0.546902 | 1.983769 |
std | 103169.547296 | 0.709047 | 1.015226e+05 | 3.653970e+05 | 16016.368315 | 3.367102e+05 | 0.014428 | 4325.900393 | 144348.507136 | 3552.612035 | ... | 0.039456 | 0.0 | 0.0 | 0.0 | 0.046373 | 0.046132 | 0.054037 | 0.110924 | 0.693305 | 1.838873 |
min | 100001.000000 | 0.000000 | 2.694150e+04 | 4.500000e+04 | 2295.000000 | 4.500000e+04 | 0.000253 | -25195.000000 | -17463.000000 | -23722.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 188557.750000 | 0.000000 | 1.125000e+05 | 2.606400e+05 | 17973.000000 | 2.250000e+05 | 0.010006 | -19637.000000 | -2910.000000 | -7459.250000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 277549.000000 | 0.000000 | 1.575000e+05 | 4.500000e+05 | 26199.000000 | 3.960000e+05 | 0.018850 | -15785.000000 | -1293.000000 | -4490.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
75% | 367555.500000 | 1.000000 | 2.250000e+05 | 6.750000e+05 | 37390.500000 | 6.300000e+05 | 0.028663 | -12496.000000 | -296.000000 | -1901.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 |
max | 456250.000000 | 20.000000 | 4.410000e+06 | 2.245500e+06 | 180576.000000 | 2.245500e+06 | 0.072508 | -7338.000000 | 365243.000000 | 0.000000 | ... | 1.000000 | 0.0 | 0.0 | 0.0 | 2.000000 | 2.000000 | 2.000000 | 6.000000 | 7.000000 | 17.000000 |
8 rows × 105 columns
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64
# Function to calculate missing values by column // credits Will Koehrsen
def missing_values_table(df):
# Total missing values
mis_val = df.isnull().sum()
# Percentage of missing values
mis_val_percent = 100 * df.isnull().sum() / len(df)
# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# Rename the columns
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : 'Missing Values', 1 : '% of Total Values'})
# Sort the table by percentage of missing descending
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
'% of Total Values', ascending=False).round(1)
# Print some summary information
print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
"There are " + str(mis_val_table_ren_columns.shape[0]) +
" columns that have missing values.")
# Return the dataframe with missing information
return mis_val_table_ren_columns
# Missing values statistics
missing_values = missing_values_table(app_train)
missing_values.head(12)
Your selected dataframe has 122 columns. There are 67 columns that have missing values.
Missing Values | % of Total Values | |
---|---|---|
COMMONAREA_MEDI | 214865 | 69.9 |
COMMONAREA_AVG | 214865 | 69.9 |
COMMONAREA_MODE | 214865 | 69.9 |
NONLIVINGAPARTMENTS_MEDI | 213514 | 69.4 |
NONLIVINGAPARTMENTS_MODE | 213514 | 69.4 |
NONLIVINGAPARTMENTS_AVG | 213514 | 69.4 |
FONDKAPREMONT_MODE | 210295 | 68.4 |
LIVINGAPARTMENTS_MODE | 210199 | 68.4 |
LIVINGAPARTMENTS_MEDI | 210199 | 68.4 |
LIVINGAPARTMENTS_AVG | 210199 | 68.4 |
FLOORSMIN_MODE | 208642 | 67.8 |
FLOORSMIN_MEDI | 208642 | 67.8 |
# cols_to_drop = list((app_train.isnull().sum() > 75000).index)
cols_to_drop = [c for c in app_train.columns if app_train[c].isnull().sum() > 75000]
app_train, app_test = app_train.drop(cols_to_drop, axis=1), app_test.drop(cols_to_drop, axis=1)
app_test.isnull().sum().sort_values(ascending=False).head(10)
EXT_SOURCE_3                  8668
AMT_REQ_CREDIT_BUREAU_YEAR    6049
AMT_REQ_CREDIT_BUREAU_QRT     6049
AMT_REQ_CREDIT_BUREAU_MON     6049
AMT_REQ_CREDIT_BUREAU_WEEK    6049
AMT_REQ_CREDIT_BUREAU_DAY     6049
AMT_REQ_CREDIT_BUREAU_HOUR    6049
NAME_TYPE_SUITE                911
DEF_60_CNT_SOCIAL_CIRCLE        29
OBS_30_CNT_SOCIAL_CIRCLE        29
dtype: int64
obj_cols = app_train.select_dtypes('object').columns
obj_cols
Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE'], dtype='object')
# filling string cols with 'Not specified'
app_train[obj_cols] = app_train[obj_cols].fillna('Not specified')
app_test[obj_cols] = app_test[obj_cols].fillna('Not specified')
float_cols = app_train.select_dtypes('float').columns
float_cols
Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'], dtype='object')
# filling float values with median of train (not test)
app_train[float_cols] = app_train[float_cols].fillna(app_train[float_cols].median())
app_test[float_cols] = app_test[float_cols].fillna(app_train[float_cols].median())  # train medians, as stated above
app_train.shape, app_test.shape
((307511, 72), (48744, 71))
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                8
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
dtype: int64
app_train['DAYS_EMPLOYED'].describe()
count    307511.000000
mean      63815.045904
std      141275.766519
min      -17912.000000
25%       -2760.000000
50%       -1213.000000
75%        -289.000000
max      365243.000000
Name: DAYS_EMPLOYED, dtype: float64
sns.distplot(app_train['DAYS_EMPLOYED'], kde=False);
plt.show()
print('The non-anomalies default on %0.2f%% of loans' % (100 * app_train[app_train['DAYS_EMPLOYED'] != 365243]['TARGET'].mean()))
print('The anomalies default on %0.2f%% of loans' % (100 * app_train[app_train['DAYS_EMPLOYED'] == 365243]['TARGET'].mean()))
print('There are %d anomalous days of employment' % len(app_train[app_train['DAYS_EMPLOYED'] == 365243]))
The non-anomalies default on 8.66% of loans
The anomalies default on 5.40% of loans
There are 55374 anomalous days of employment
# Create an anomalous flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Replace the anomalous values with nan
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
sns.distplot(app_train['DAYS_EMPLOYED'].dropna(), kde=False);
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)
print('There are %d anomalies in the test data out of %d entries' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))
There are 9274 anomalies in the test data out of 48744 entries
# refilling float values with median of train (not test)
app_train[float_cols] = app_train[float_cols].apply(pd.to_numeric, errors='coerce')
app_train = app_train.fillna(app_train.median())
app_test[float_cols] = app_test[float_cols].apply(pd.to_numeric, errors='coerce')
app_test = app_test.fillna(app_train.median())
The correlation coefficient is not the most suitable way to measure feature importance, but it gives a first idea of the relationships.
correlations = app_train.corr()['TARGET'].sort_values()
print('Most Positive Correlations:\n', correlations.tail(10))
print('\n\nMost Negative Correlations:\n', correlations.head(10))
Most Positive Correlations:
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_WORK_CITY         0.050994
DAYS_ID_PUBLISH                0.051457
DAYS_LAST_PHONE_CHANGE         0.055218
REGION_RATING_CLIENT           0.058899
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_EMPLOYED                  0.063368
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64

Most Negative Correlations:
EXT_SOURCE_2                  -0.160295
EXT_SOURCE_3                  -0.155892
DAYS_EMPLOYED_ANOM            -0.045987
AMT_GOODS_PRICE               -0.039623
REGION_POPULATION_RELATIVE    -0.037227
AMT_CREDIT                    -0.030369
FLAG_DOCUMENT_6               -0.028602
HOUR_APPR_PROCESS_START       -0.024166
FLAG_PHONE                    -0.023806
AMT_REQ_CREDIT_BUREAU_MON     -0.014794
Name: TARGET, dtype: float64
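These linear correlations can miss non-monotonic effects. As a complementary check, mutual information between each numeric feature and TARGET can be estimated; a minimal sketch, subsampling to keep it fast (the sample size and random_state are illustrative choices, not part of the original pipeline):
# Complementary, non-linear relevance measure (sketch): mutual information
# between each numeric feature and TARGET, on a 20k-row subsample for speed.
from sklearn.feature_selection import mutual_info_classif
num_cols = app_train.select_dtypes('number').columns.drop('TARGET')
sample = app_train.sample(20000, random_state=0)
mi = mutual_info_classif(sample[num_cols].fillna(0), sample['TARGET'], random_state=0)
print(pd.Series(mi, index=num_cols).sort_values(ascending=False).head(10))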
# Compute the correlation matrix
corr = app_train.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(21, 19))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>
# Find the correlation of the positive days since birth and target
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])
-0.07823930830984513
plt.figure(figsize = (12, 6))
# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')
# KDE plot of loans which were not repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')
# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');
# Age information into a separate dataframe
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365
# Bin the age data
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)
TARGET | DAYS_BIRTH | YEARS_BIRTH | YEARS_BINNED | |
---|---|---|---|---|
0 | 1 | 9461 | 25.920548 | (25.0, 30.0] |
1 | 0 | 16765 | 45.931507 | (45.0, 50.0] |
2 | 0 | 19046 | 52.180822 | (50.0, 55.0] |
3 | 0 | 19005 | 52.068493 | (50.0, 55.0] |
4 | 0 | 19932 | 54.608219 | (50.0, 55.0] |
5 | 0 | 16941 | 46.413699 | (45.0, 50.0] |
6 | 0 | 13778 | 37.747945 | (35.0, 40.0] |
7 | 0 | 18850 | 51.643836 | (50.0, 55.0] |
8 | 0 | 20099 | 55.065753 | (55.0, 60.0] |
9 | 0 | 14469 | 39.641096 | (35.0, 40.0] |
# Group by the bin and calculate averages
age_groups = age_data.groupby('YEARS_BINNED').mean()
age_groups
TARGET | DAYS_BIRTH | YEARS_BIRTH | |
---|---|---|---|
YEARS_BINNED | |||
(20.0, 25.0] | 0.123036 | 8532.795625 | 23.377522 |
(25.0, 30.0] | 0.111436 | 10155.219250 | 27.822518 |
(30.0, 35.0] | 0.102814 | 11854.848377 | 32.479037 |
(35.0, 40.0] | 0.089414 | 13707.908253 | 37.555913 |
(40.0, 45.0] | 0.078491 | 15497.661233 | 42.459346 |
(45.0, 50.0] | 0.074171 | 17323.900441 | 47.462741 |
(50.0, 55.0] | 0.066968 | 19196.494791 | 52.593136 |
(55.0, 60.0] | 0.055314 | 20984.262742 | 57.491131 |
(60.0, 65.0] | 0.052737 | 22780.547460 | 62.412459 |
(65.0, 70.0] | 0.037270 | 24292.614340 | 66.555108 |
plt.figure(figsize = (8, 6))
# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])
# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');
2-Data preparation
app_train = pd.get_dummies(data=app_train, columns=obj_cols)
app_test = pd.get_dummies(data=app_test, columns=obj_cols)
# back up of the target / need to keep this information
y = app_train.TARGET
app_train = app_train.drop(columns=['TARGET'])
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train.shape, app_test.shape
((307511, 168), (307511, 168))
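The inner align keeps only the dummy columns present in both frames, so train and test end up with identical column sets; a quick sanity check (sketch, not in the original notebook):
# Sanity check (sketch): after align(join='inner'), both frames expose exactly
# the same columns in the same order, so a model fit on app_train can score app_test.
assert list(app_train.columns) == list(app_test.columns)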
feat_to_scale = list(float_cols).copy()
feat_to_scale.extend(['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START'])
feat_to_scale
['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START']
scaler = StandardScaler()
app_train[feat_to_scale] = scaler.fit_transform(app_train[feat_to_scale])
app_test[feat_to_scale] = scaler.transform(app_test[feat_to_scale])  # reuse the scaler fitted on train
app_train.head()
SK_ID_CURR | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | ... | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 5 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | ORGANIZATION_TYPE_XNA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | -0.577538 | 0.142129 | -0.478095 | -0.166143 | -0.507236 | -0.149452 | -1.506880 | 0.755835 | 0.379837 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 100003 | -0.577538 | 0.426792 | 1.725450 | 0.592683 | 1.600873 | -1.252750 | 0.166821 | 0.497899 | 1.078697 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 100004 | -0.577538 | -0.427196 | -1.152888 | -1.404669 | -1.092145 | -0.783451 | 0.689509 | 0.948701 | 0.206116 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 100006 | -0.577538 | -0.142533 | -0.711430 | 0.177874 | -0.653463 | -0.928991 | 0.680114 | -0.368597 | -1.375829 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 100007 | -0.577538 | -0.199466 | -0.213734 | -0.361749 | -0.068554 | 0.563570 | 0.892535 | -0.368129 | 0.191639 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 168 columns
3-Machine learning algorithms
X_train, X_test, y_train, y_test = train_test_split(app_train, y, test_size=0.2)
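With only ~8% positives, a stratified split keeps the class ratio identical in both folds; a variant of the split above (sketch, random_state chosen arbitrarily):
# Stratified variant (sketch): preserves the ~8% positive rate in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    app_train, y, test_size=0.2, stratify=y, random_state=42)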
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
nbSplits=2
nbRepeats=2
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spaceDummyClf = [{'constant': [None], 'random_state': [None], 'strategy': ["stratified", "most_frequent", "prior", "uniform","constant"]}]
search= RandomizedSearchCV(estimator = dummy_clf, param_distributions = spaceDummyClf, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 5 candidates, totalling 20 fits
roc: 0.5
F1: 0.8780010382781587
plot_confusion_matrix(dummy_clf, X_test, y_test)
plt.show()
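Note that plot_confusion_matrix was removed in scikit-learn 1.2; on recent versions the equivalent call is ConfusionMatrixDisplay.from_estimator (sketch):
# Equivalent on scikit-learn >= 1.0 (plot_confusion_matrix was removed in 1.2)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(dummy_clf, X_test, y_test)
plt.show()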
nbSplits=2
nbRepeats=2
model_randomforest = RandomForestClassifier()
model_randomforest.fit(X_train,y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spacerandomforest = [{'bootstrap': [True], 'ccp_alpha': [0.0], 'class_weight': ['balanced'], 'criterion': ['gini'], 'max_depth': [None], 'max_features': ['auto'], 'max_leaf_nodes': [None], 'max_samples': [None], 'min_impurity_decrease': [0.0], 'min_samples_leaf': [1], 'min_samples_split':[2], 'min_weight_fraction_leaf': [0.0], 'n_estimators': [10,50], 'n_jobs': [-1,2], 'oob_score': [False], 'random_state': [None],'warm_start': [False]}]
search= RandomizedSearchCV(estimator = model_randomforest, param_distributions = spacerandomforest, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 4 candidates, totalling 16 fits
roc: 0.5000808004690401
F1: 0.878024753363463
plot_confusion_matrix(model_randomforest, X_test, y_test)
plt.show()
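The ROC AUC above is computed on hard 0/1 predictions, which is why it barely moves from 0.5; scoring the predicted probabilities measures the ranking quality directly. A minimal sketch using the fitted search object:
# AUC from predicted probabilities (sketch): scores the full ranking instead
# of hard 0/1 labels, usually a more informative view of the classifier.
proba = search.predict_proba(X_test)[:, 1]
print("roc (proba):", roc_auc_score(y_test, proba))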
nbSplits=2
nbRepeats=2
model_lightgbm = lgb.LGBMClassifier()
model_lightgbm.fit(X_train,y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spacelightgbm = [{'boosting_type': ['gbdt'], 'class_weight': ['balanced'], 'colsample_bytree': [1.0], 'importance_type': ['split'], 'learning_rate': [0.1,0.2], 'max_depth': [-1], 'n_estimators': [50], 'n_jobs': [-1], 'num_leaves': [31], 'objective': [None], 'random_state': [50], 'reg_alpha': [0.0,0.2], 'reg_lambda': [0.0,0.2], 'silent': [True], 'subsample': [1.0], }]
search= RandomizedSearchCV(estimator = model_lightgbm, param_distributions = spacelightgbm, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 8 candidates, totalling 32 fits
roc: 0.6860200208050994
F1: 0.763578053725843
plot_confusion_matrix(model_lightgbm, X_test, y_test)
plt.show()
nbSplits=2
nbRepeats=2
model_xgBoost = xgb.XGBClassifier()
model_xgBoost.fit(X_train, y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spaceXgBoost = [{'objective':["binary:logistic"], 'random_state':[10,20], 'eval_metric':['auc'], 'max_delta_step':[2,4], 'scale_pos_weight':[10,20]}]
search= RandomizedSearchCV(estimator = model_xgBoost, param_distributions = spaceXgBoost, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
[12:13:14] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Fitting 4 folds for each of 8 candidates, totalling 32 fits
roc: 0.6758434428493557
F1: 0.8046231342459104
plot_confusion_matrix(model_xgBoost, X_test, y_test)
plt.show()
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
RandomForestClassifier()
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(app_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
Feature ranking: 1. feature 27 (0.064638) 2. feature 28 (0.059709) 3. feature 10 (0.046448) 4. feature 7 (0.046111) 5. feature 9 (0.045583) 6. feature 0 (0.044521) 7. feature 4 (0.042292) 8. feature 8 (0.041667) 9. feature 33 (0.040434) 10. feature 3 (0.039350) 11. feature 2 (0.035407) 12. feature 6 (0.035158) 13. feature 5 (0.034121) 14. feature 20 (0.031223) 15. feature 59 (0.022319) 16. feature 29 (0.017723) 17. feature 31 (0.017540) 18. feature 17 (0.013928) 19. feature 1 (0.009987) 20. feature 58 (0.008297) 21. feature 57 (0.007449) 22. feature 15 (0.006855) 23. feature 68 (0.006735) 24. feature 69 (0.006599) 25. feature 92 (0.006576) 26. feature 30 (0.006573) 27. feature 18 (0.006403) 28. feature 115 (0.006391) 29. feature 19 (0.006384) 30. feature 108 (0.006329) 31. feature 109 (0.006230) 32. feature 13 (0.006218) 33. feature 107 (0.006087) 34. feature 103 (0.006029) 35. feature 104 (0.005855) 36. feature 152 (0.005557) 37. feature 77 (0.005529) 38. feature 32 (0.005298) 39. feature 66 (0.005267) 40. feature 85 (0.005192) 41. feature 67 (0.005167) 42. feature 25 (0.005072) 43. feature 26 (0.004986) 44. feature 105 (0.004938) 45. feature 94 (0.004925) 46. feature 35 (0.004904) 47. feature 91 (0.004711) 48. feature 71 (0.004656) 49. feature 90 (0.004614) 50. feature 79 (0.004465) 51. feature 24 (0.004332) 52. feature 98 (0.004290) 53. feature 87 (0.004091) 54. feature 64 (0.003997) 55. feature 63 (0.003789) 56. feature 93 (0.003625) 57. feature 16 (0.003462) 58. feature 106 (0.003423) 59. feature 143 (0.003306) 60. feature 102 (0.003058) 61. feature 40 (0.002678) 62. feature 114 (0.002667) 63. feature 117 (0.002584) 64. feature 99 (0.002551) 65. feature 76 (0.002528) 66. feature 56 (0.002501) 67. feature 22 (0.002487) 68. feature 161 (0.002461) 69. feature 121 (0.002383) 70. feature 23 (0.002191) 71. feature 82 (0.002159) 72. feature 140 (0.002140) 73. feature 96 (0.002066) 74. feature 165 (0.002030) 75. feature 88 (0.002004) 76. feature 101 (0.001941) 77. feature 113 (0.001895) 78. feature 61 (0.001878) 79. feature 62 (0.001842) 80. feature 149 (0.001798) 81. feature 38 (0.001774) 82. feature 89 (0.001683) 83. feature 130 (0.001682) 84. feature 138 (0.001600) 85. feature 157 (0.001538) 86. feature 150 (0.001428) 87. feature 37 (0.001400) 88. feature 111 (0.001339) 89. feature 164 (0.001303) 90. feature 70 (0.001274) 91. feature 21 (0.001249) 92. feature 126 (0.001238) 93. feature 123 (0.001230) 94. feature 148 (0.001187) 95. feature 136 (0.001077) 96. feature 163 (0.001045) 97. feature 81 (0.001025) 98. feature 75 (0.001023) 99. feature 60 (0.000999) 100. feature 167 (0.000991) 101. feature 145 (0.000983) 102. feature 55 (0.000968) 103. feature 100 (0.000959) 104. feature 12 (0.000958) 105. feature 54 (0.000826) 106. feature 124 (0.000807) 107. feature 134 (0.000801) 108. feature 153 (0.000745) 109. feature 48 (0.000734) 110. feature 141 (0.000726) 111. feature 131 (0.000721) 112. feature 144 (0.000718) 113. feature 112 (0.000704) 114. feature 156 (0.000692) 115. feature 50 (0.000624) 116. feature 97 (0.000595) 117. feature 74 (0.000588) 118. feature 122 (0.000532) 119. feature 166 (0.000527) 120. feature 41 (0.000488) 121. feature 151 (0.000476) 122. feature 119 (0.000449) 123. feature 154 (0.000440) 124. feature 73 (0.000436) 125. feature 146 (0.000404) 126. feature 155 (0.000392) 127. feature 110 (0.000354) 128. feature 132 (0.000343) 129. feature 43 (0.000323) 130. feature 72 (0.000314) 131. feature 120 (0.000306) 132. feature 14 (0.000304) 133. feature 142 (0.000304) 134. 
feature 129 (0.000303) 135. feature 137 (0.000300) 136. feature 116 (0.000291) 137. feature 118 (0.000282) 138. feature 160 (0.000268) 139. feature 139 (0.000261) 140. feature 45 (0.000247) 141. feature 46 (0.000190) 142. feature 51 (0.000164) 143. feature 127 (0.000157) 144. feature 52 (0.000133) 145. feature 47 (0.000128) 146. feature 162 (0.000123) 147. feature 53 (0.000119) 148. feature 84 (0.000118) 149. feature 133 (0.000096) 150. feature 147 (0.000091) 151. feature 125 (0.000080) 152. feature 34 (0.000073) 153. feature 128 (0.000067) 154. feature 135 (0.000052) 155. feature 86 (0.000049) 156. feature 159 (0.000048) 157. feature 39 (0.000044) 158. feature 49 (0.000031) 159. feature 158 (0.000030) 160. feature 80 (0.000018) 161. feature 83 (0.000002) 162. feature 42 (0.000001) 163. feature 36 (0.000001) 164. feature 44 (0.000001) 165. feature 65 (0.000000) 166. feature 78 (0.000000) 167. feature 11 (0.000000) 168. feature 95 (0.000000)
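The ranking above only prints positional indices; mapping them back to column names makes it readable (sketch, top 15 only):
# Map positional feature indices back to column names (sketch)
feature_names = app_train.columns
for rank, idx in enumerate(indices[:15], start=1):
    print("%d. %s (%f)" % (rank, feature_names[idx], importances[idx]))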
# Plot the feature importances of the rf
plt.figure(figsize=(16, 8))
plt.title("Feature importances")
plt.bar(range(app_train.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(app_train.shape[1]), indices)
plt.xlim([-1, app_train.shape[1]])
plt.show()
(pd.Series(rf.feature_importances_, index=app_train.columns)
.nlargest(15)
.plot(kind='barh'))
<AxesSubplot:>
xgb.plot_importance(model_xgBoost,height=100,max_num_features=12)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# refit the XGBoost model on the full training set
model_xgBoost.fit(app_train, y)
# Make predictions & make sure to select the second column only
result = model_xgBoost.predict_proba(app_test)[:, 1]
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = result
submit.head(5)
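To produce a Kaggle-style submission, the frame can then be written to CSV; a minimal sketch (the file name is an assumption):
# Write the submission file (sketch); 'submission.csv' is an illustrative path
submit.to_csv('submission.csv', index=False)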